HTML Oddmuse to TiddlyWiki Conversion

1st January 2023 at 4:51pm
2022-12-10 prototype "HTML-oddmuse-to-tiddlywiki-conversion.prl" code

#! /usr/bin/perl

# oddmuse-HTML-to-tiddlywiki-conversion.prl
# ^z - 2003-12-31 2007-07-15 2007-12-25 2022-11-27 2022-12-07 2022-12-10
#
# usage:  perl oddmuse-HTML-to-tiddlywiki-conversion.prl indir outdir
#
# take all files in "indir", convert contents, replace '.html' with '.tid' at the end of their names, store results in "outdir"
# indir and outdir must already exist
#
# process:
#    mass-export Oddmuse ZhurnalyWiki files as "static" HTML files to directory "indir"
#    for each individual HTML file:
#        make the first line of each file say title: New-wiki-page-name\n\n
#           (where New-wiki-page-name is the file name minus .html with "_" changed to " ")
#        delete HTML header stuff down to and including <div class="content browse">
#        change every <p> into a blank line (\n\n)
#        delete HTML footer stuff starting with </div><div class="wrapper close"> to the end
#        delete all </p>
#        fix external links like <a class="url http outside" href="URL">label</a>
#        fix internal links like <a class="local" href="URL">Wiki Page Name</a>
#        do other conversion such as <strong>...</strong> or <em>...</em> as desired (see below)
#    replace .html with .tid at the end of each file name
#    drag-and-drop into an empty TiddlyWiki to import!

use strict;
use warnings;

# convert_html_to_tiddlywiki($html) -> $tiddler_text
#
# Convert the full text of one static Oddmuse ZhurnalyWiki HTML page to
# TiddlyWiki markup (mostly!).  Takes the whole page as a single string and
# returns the converted body; the "title:" header is NOT added here.
# The order of the substitutions matters (e.g. page header/footer must be
# stripped before <p> handling, external links before local links).
sub convert_html_to_tiddlywiki {
    my ($body) = @_;

# remove prefix HTML to <div class="content browse">
    $body =~ s/^.*?<div class="content browse">//sg;
# remove suffix HTML from </div><div class="wrapper close">
    $body =~ s/<\/div><div class="wrapper close">.*$//sg;
# change all <p> to \n\n
    $body =~ s/<p>/\n\n/sg;
# remove all </p>
    $body =~ s/<\/p>//sg;
# change external links to TiddlyWiki markup
# NOTE(review): only class "url http outside" is matched; links with any other
# class (possibly https ones) are left as raw HTML -- confirm against real exports
    $body =~ s/<a class="url http outside" href="(.*?)">(.*?)<\/a>/[[$2\|$1]]/sg;
# change local links to TiddlyWiki markup (ASSUME label is proper Tiddler name!)
    $body =~ s/<a class="local".*?>(.*?)<\/a>/[[$1]]/sg;
# fix BOLD (STRONG)
    $body =~ s/<strong>(.*?)<\/strong>/''$1''/sg;
# fix ITALIC (EM)
    $body =~ s/<em>(.*?)<\/em>/\/\/$1\/\//sg;
# fix BLOCKQUOTE
    $body =~ s/<blockquote>(.*?)<\/blockquote>/\n\n<<<\n$1\n<<<\n/sg;
# fix LIST (ASSUME list is all one-level UNORDERED bullets! <ol> items
# therefore also come out as "*" bullets)
    $body =~ s/<ul>/\n\n/sg;
    $body =~ s/<ol>/\n\n/sg;
    $body =~ s/<li>(.*?)<\/li>/\n* $1\n/sg;
    $body =~ s/<\/ul>/\n\n/sg;
    $body =~ s/<\/ol>/\n\n/sg;
# fix HORIZONTAL RULE and put empty line above it and \n after it
    $body =~ s/<hr \/>/\n\n---\n/sg;
# fix HEADER markup
    $body =~ s/<h1>/\n\n!/sg;
    $body =~ s/<h2>/\n\n!!/sg;
    $body =~ s/<h3>/\n\n!!!/sg;
    $body =~ s/<h4>/\n\n!!!!/sg;
    $body =~ s/<h5>/\n\n!!!!!/sg;
    $body =~ s/<h6>/\n\n!!!!!!/sg;
    $body =~ s/<\/h[1-6]>/\n/sg;
# fix ESCAPED words
    $body =~ s/<code>(.+?)<\/code>/``$1``/sg;
# fix ESCAPED lines
    $body =~ s/<pre class="real">/\n\n```\n/sg;
    $body =~ s/<\/pre>/\n```\n/sg;

    return $body;
}

# The POD block below is retired code kept only for reference; perl skips
# everything from "=begin" to "=cut".

=begin comment

# GET RID OF OLD STUFF some day!!!

# convert Oddmuse markup to TiddlyWiki markup, as much as is reasonably possible
# BOLD
    $body =~ s/\*\*/\'\'/sg; # BOLD:  ** becomes ''
# BLOCKQUOTE
    $body =~ s/\"\"\"/\n<<</sg; # BLOCKQUOTE: """ becomes \n<<<
# UNORDERED LIST
    $body =~ s/\n\*([^*])/\n\n\*$1/sg; # BULLET LIST: initial * gets a \n above
# ORDERED LIST
    $body =~ s/\n\#([^#])/\n\n\#$1/sg; # NUMBER LIST: initial # gets a \n above
# HORIZONTAL RULE
    $body =~ s/\n----/\n\n----/sg; # HORIZONTAL RULE: put a blank line above
# HEADERS
    $body =~ s/\n======/\n\n!!!!!!/sg; # HEADER 6: change = to ! and put blank line above
    $body =~ s/\n=====/\n\n!!!!!/sg; # HEADER 5: change = to ! and put blank line above
    $body =~ s/\n====/\n\n!!!!/sg; #HEADER 4: change = to ! and put blank line above
    $body =~ s/\n===/\n\n!!!/sg; # HEADER 3: change = to ! and put blank line above
    $body =~ s/\n==/\n\n!!/sg; # HEADER 2: change = to ! and put blank line above
    $body =~ s/\n=/\n\n!/sg; # HEADER 1: change = to ! and put blank line above
# Reference LINK [URL] (avoid greedy pattern match!)
    $body =~ s/[^[]\[(http.+?)]/\[\[\*\|$1]]/sg; # [URL] --> [[*|URL]]
# Labeled URL (avoid greedy pattern match!)
    $body =~ s/\[\[(http.+?)\|(.+?)\]\]/[[$2\|$1]]/sg; # LINK [[URL|label]] --> [[label|URL]]
# Inline IMAGE LINK
    $body =~ s/(\shttp.+?\.jpg)/[img\[$1]]/sg; # URL.jpg --> [img[URL.jpg]]
    $body =~ s/(\shttp.+?\.png)/[img\[$1]]/sg; # URL.png --> [img[URL.png]]
# Displayed IMAGE with LINK
    $body =~ s/\[\[image:(http.+?)\|(.*?)\|(http.+?)\]\]/<a href="$3" target="_blank">[img["$2"\|$1]]<\/a>/sg; # [[image:URL-displayed-image|label_text|URL-linked-image]] --> <a href="URL-linked-image" target="_blank">[img["label_text"|URL-displayed-image]]</a>
# TABLES
    $body =~ s/^\|(.*?)\n(.*?)\|/\|$1<br>$2\|/sg; # in a table cell \n --> <br>
    $body =~ s/^\|(.*?)\n(.*?)\n(.*?)\|/\|$1<br>$2<br>$3\|/sg; # and \n...\n --> <br>...<br>
        # (worry later about 3 or more \n in a table cell)
    $body =~ s/\|(.*?)\\\\(.*?)\|/\|$1<br>$2\|/sg; # in a table cell \\ --> <br>
    $body =~ s/\|(.*?)\\\\(.*?)\\\\(.*?)\|/\|$1<br>$2<br>$3\|/sg; # and \\...\\  --> <br>...<br>
                # (worry later about 3 or more \\ in a table cell)
    $body =~ s/^(\|.+?\|)$/\n$1/msg; # (note "msg"!) put blank line above every table row
        # (worry later about blank lines within tables
# ESCAPED WORDS
    $body =~ s/{{{(.+?)}}}/``$1``/sg; # {{{...}}} --> ``...``

=end comment

=cut

# page_basename($page) -> file name with a trailing ".html" removed
# (names without that suffix are returned unchanged).
sub page_basename {
    my ($page) = @_;
    (my $base = $page) =~ s/\.html$//;
    return $base;
}

# tiddler_title($page) -> Tiddler title: the file name without ".html"
# and with every "_" changed to " ".
sub tiddler_title {
    my ($page) = @_;
    my $title = page_basename($page);
    $title =~ s/_/ /g;
    return $title;
}

# tiddler_filename($page) -> output file name: ".html" replaced by ".tid"
# (".tid" is simply appended when there is no ".html" suffix).
sub tiddler_filename {
    my ($page) = @_;
    return page_basename($page) . '.tid';
}

# slurp_file($path) -> entire file contents as one string; dies on failure.
sub slurp_file {
    my ($path) = @_;
    open(my $in, '<', $path) or die "$path: $!";
    local $/;    # grab entire file at once, without changing $/ for anyone else
    my $text = <$in>;
    defined $text or die "couldn't read $path: $!";
    close($in);
    return $text;
}

# spew_file($path, $text): write $text to $path, replacing any existing file;
# dies on failure (including buffered write errors reported at close).
sub spew_file {
    my ($path, $text) = @_;
    open(my $out, '>', $path) or die "$path: $!";
    print {$out} $text or die "$path: $!";
    close($out) or die "$path: $!";
    return;
}

# main($indir, $outdir)
#
# Convert every non-hidden plain file in $indir and store the result in
# $outdir as <name-without-.html>.tid.  Both directories must already exist.
# Prints one progress line per page on STDOUT; dies on any I/O failure.
sub main {
    my ($indir, $outdir) = @_;
    defined $indir && defined $outdir
        or die "usage:  perl oddmuse-HTML-to-tiddlywiki-conversion.prl indir outdir\n";

    opendir(my $in_dh, $indir) or die "couldn't open input directory $indir: $!";
    # outdir is opened only to verify that it exists and is a directory
    opendir(my $out_dh, $outdir) or die "couldn't open output directory $outdir: $!";
    closedir($out_dh);
    # skip dot-files (and . / ..); sort so runs are reproducible
    my @pages = sort grep { !/^\./ } readdir $in_dh;
    closedir($in_dh);

    foreach my $page (@pages) {
        my $source = "$indir/$page";
        -e $source or die "$page didn't exist: $!";
        print "  $page ... ";
        if ( !-f $source ) {
            # subdirectories etc. cannot be converted
            print "skipped (not a plain file)\n";
            next;
        }

        my $body = convert_html_to_tiddlywiki(slurp_file($source));

        # spaced-out-title at top followed by an empty line
        $body = 'title: ' . tiddler_title($page) . "\n\n$body";

        # write the results to *.tid
        spew_file("$outdir/" . tiddler_filename($page), $body);
        print "\n";
    }
    return;
}

# run only when executed as a script, so the subs above can be loaded and tested
main(@ARGV) unless caller;